Consider the data set USJudgeRatings in R
# load the built-in USJudgeRatings data set (43 judges, 12 rating variables)
data("USJudgeRatings")
# store in a shorter variable name USJR
USJR <- USJudgeRatings
# ? opens the help page with the data set's documentation
?USJudgeRatings
# scatterplot matrix of every pair of rating variables
# (pairs() draws scatterplots, not a correlation matrix)
pairs(USJR, col=6, pch=8)
# using the pipe-operator %>% to establish a pipeline of functions
USJR %>% summary()
## CONT INTG DMNR DILG
## Min. : 5.700 Min. :5.900 Min. :4.300 Min. :5.100
## 1st Qu.: 6.850 1st Qu.:7.550 1st Qu.:6.900 1st Qu.:7.150
## Median : 7.300 Median :8.100 Median :7.700 Median :7.800
## Mean : 7.437 Mean :8.021 Mean :7.516 Mean :7.693
## 3rd Qu.: 7.900 3rd Qu.:8.550 3rd Qu.:8.350 3rd Qu.:8.450
## Max. :10.600 Max. :9.200 Max. :9.000 Max. :9.000
## CFMG DECI PREP FAMI
## Min. :5.400 Min. :5.700 Min. :4.800 Min. :5.100
## 1st Qu.:7.000 1st Qu.:7.100 1st Qu.:6.900 1st Qu.:6.950
## Median :7.600 Median :7.700 Median :7.700 Median :7.600
## Mean :7.479 Mean :7.565 Mean :7.467 Mean :7.488
## 3rd Qu.:8.050 3rd Qu.:8.150 3rd Qu.:8.200 3rd Qu.:8.250
## Max. :8.700 Max. :8.800 Max. :9.100 Max. :9.100
## ORAL WRIT PHYS RTEN
## Min. :4.700 Min. :4.900 Min. :4.700 Min. :4.800
## 1st Qu.:6.850 1st Qu.:6.900 1st Qu.:7.700 1st Qu.:7.150
## Median :7.500 Median :7.600 Median :8.100 Median :7.800
## Mean :7.293 Mean :7.384 Mean :7.935 Mean :7.602
## 3rd Qu.:8.000 3rd Qu.:8.050 3rd Qu.:8.500 3rd Qu.:8.250
## Max. :8.900 Max. :9.000 Max. :9.100 Max. :9.200
USJR %>% glimpse()
## Rows: 43
## Columns: 12
## $ CONT <dbl> 5.7, 6.8, 7.2, 6.8, 7.3, 6.2, 10.6, 7.0, 7.3, 8.2, 7.0, 6.5, 6.7,…
## $ INTG <dbl> 7.9, 8.9, 8.1, 8.8, 6.4, 8.8, 9.0, 5.9, 8.9, 7.9, 8.0, 8.0, 8.6, …
## $ DMNR <dbl> 7.7, 8.8, 7.8, 8.5, 4.3, 8.7, 8.9, 4.9, 8.9, 6.7, 7.6, 7.6, 8.2, …
## $ DILG <dbl> 7.3, 8.5, 7.8, 8.8, 6.5, 8.5, 8.7, 5.1, 8.7, 8.1, 7.4, 7.2, 6.8, …
## $ CFMG <dbl> 7.1, 7.8, 7.5, 8.3, 6.0, 7.9, 8.5, 5.4, 8.6, 7.9, 7.3, 7.0, 6.9, …
## $ DECI <dbl> 7.4, 8.1, 7.6, 8.5, 6.2, 8.0, 8.5, 5.9, 8.5, 8.0, 7.5, 7.1, 6.6, …
## $ PREP <dbl> 7.1, 8.0, 7.5, 8.7, 5.7, 8.1, 8.5, 4.8, 8.4, 7.9, 7.1, 6.9, 7.1, …
## $ FAMI <dbl> 7.1, 8.0, 7.5, 8.7, 5.7, 8.0, 8.5, 5.1, 8.4, 8.1, 7.2, 7.0, 7.3, …
## $ ORAL <dbl> 7.1, 7.8, 7.3, 8.4, 5.1, 8.0, 8.6, 4.7, 8.4, 7.7, 7.1, 7.0, 7.2, …
## $ WRIT <dbl> 7.0, 7.9, 7.4, 8.5, 5.3, 8.0, 8.4, 4.9, 8.5, 7.8, 7.2, 7.1, 7.2, …
## $ PHYS <dbl> 8.3, 8.5, 7.9, 8.8, 5.5, 8.6, 9.1, 6.8, 8.8, 8.5, 8.4, 6.9, 8.1, …
## $ RTEN <dbl> 7.8, 8.7, 7.8, 8.7, 4.8, 8.6, 9.0, 5.0, 8.8, 7.9, 7.7, 7.2, 7.7, …
USJR %>% colnames()
## [1] "CONT" "INTG" "DMNR" "DILG" "CFMG" "DECI" "PREP" "FAMI" "ORAL" "WRIT"
## [11] "PHYS" "RTEN"
USJR
## CONT INTG DMNR DILG CFMG DECI PREP FAMI ORAL WRIT PHYS RTEN
## AARONSON,L.H. 5.7 7.9 7.7 7.3 7.1 7.4 7.1 7.1 7.1 7.0 8.3 7.8
## ALEXANDER,J.M. 6.8 8.9 8.8 8.5 7.8 8.1 8.0 8.0 7.8 7.9 8.5 8.7
## ARMENTANO,A.J. 7.2 8.1 7.8 7.8 7.5 7.6 7.5 7.5 7.3 7.4 7.9 7.8
## BERDON,R.I. 6.8 8.8 8.5 8.8 8.3 8.5 8.7 8.7 8.4 8.5 8.8 8.7
## BRACKEN,J.J. 7.3 6.4 4.3 6.5 6.0 6.2 5.7 5.7 5.1 5.3 5.5 4.8
## BURNS,E.B. 6.2 8.8 8.7 8.5 7.9 8.0 8.1 8.0 8.0 8.0 8.6 8.6
## CALLAHAN,R.J. 10.6 9.0 8.9 8.7 8.5 8.5 8.5 8.5 8.6 8.4 9.1 9.0
## COHEN,S.S. 7.0 5.9 4.9 5.1 5.4 5.9 4.8 5.1 4.7 4.9 6.8 5.0
## DALY,J.J. 7.3 8.9 8.9 8.7 8.6 8.5 8.4 8.4 8.4 8.5 8.8 8.8
## DANNEHY,J.F. 8.2 7.9 6.7 8.1 7.9 8.0 7.9 8.1 7.7 7.8 8.5 7.9
## DEAN,H.H. 7.0 8.0 7.6 7.4 7.3 7.5 7.1 7.2 7.1 7.2 8.4 7.7
## DEVITA,H.J. 6.5 8.0 7.6 7.2 7.0 7.1 6.9 7.0 7.0 7.1 6.9 7.2
## DRISCOLL,P.J. 6.7 8.6 8.2 6.8 6.9 6.6 7.1 7.3 7.2 7.2 8.1 7.7
## GRILLO,A.E. 7.0 7.5 6.4 6.8 6.5 7.0 6.6 6.8 6.3 6.6 6.2 6.5
## HADDEN,W.L.JR. 6.5 8.1 8.0 8.0 7.9 8.0 7.9 7.8 7.8 7.8 8.4 8.0
## HAMILL,E.C. 7.3 8.0 7.4 7.7 7.3 7.3 7.3 7.2 7.1 7.2 8.0 7.6
## HEALEY.A.H. 8.0 7.6 6.6 7.2 6.5 6.5 6.8 6.7 6.4 6.5 6.9 6.7
## HULL,T.C. 7.7 7.7 6.7 7.5 7.4 7.5 7.1 7.3 7.1 7.3 8.1 7.4
## LEVINE,I. 8.3 8.2 7.4 7.8 7.7 7.7 7.7 7.8 7.5 7.6 8.0 8.0
## LEVISTER,R.L. 9.6 6.9 5.7 6.6 6.9 6.6 6.2 6.0 5.8 5.8 7.2 6.0
## MARTIN,L.F. 7.1 8.2 7.7 7.1 6.6 6.6 6.7 6.7 6.8 6.8 7.5 7.3
## MCGRATH,J.F. 7.6 7.3 6.9 6.8 6.7 6.8 6.4 6.3 6.3 6.3 7.4 6.6
## MIGNONE,A.F. 6.6 7.4 6.2 6.2 5.4 5.7 5.8 5.9 5.2 5.8 4.7 5.2
## MISSAL,H.M. 6.2 8.3 8.1 7.7 7.4 7.3 7.3 7.3 7.2 7.3 7.8 7.6
## MULVEY,H.M. 7.5 8.7 8.5 8.6 8.5 8.4 8.5 8.5 8.4 8.4 8.7 8.7
## NARUK,H.J. 7.8 8.9 8.7 8.9 8.7 8.8 8.9 9.0 8.8 8.9 9.0 9.0
## O'BRIEN,F.J. 7.1 8.5 8.3 8.0 7.9 7.9 7.8 7.8 7.8 7.7 8.3 8.2
## O'SULLIVAN,T.J. 7.5 9.0 8.9 8.7 8.4 8.5 8.4 8.3 8.3 8.3 8.8 8.7
## PASKEY,L. 7.5 8.1 7.7 8.2 8.0 8.1 8.2 8.4 8.0 8.1 8.4 8.1
## RUBINOW,J.E. 7.1 9.2 9.0 9.0 8.4 8.6 9.1 9.1 8.9 9.0 8.9 9.2
## SADEN.G.A. 6.6 7.4 6.9 8.4 8.0 7.9 8.2 8.4 7.7 7.9 8.4 7.5
## SATANIELLO,A.G. 8.4 8.0 7.9 7.9 7.8 7.8 7.6 7.4 7.4 7.4 8.1 7.9
## SHEA,D.M. 6.9 8.5 7.8 8.5 8.1 8.2 8.4 8.5 8.1 8.3 8.7 8.3
## SHEA,J.F.JR. 7.3 8.9 8.8 8.7 8.4 8.5 8.5 8.5 8.4 8.4 8.8 8.8
## SIDOR,W.J. 7.7 6.2 5.1 5.6 5.6 5.9 5.6 5.6 5.3 5.5 6.3 5.3
## SPEZIALE,J.A. 8.5 8.3 8.1 8.3 8.4 8.2 8.2 8.1 7.9 8.0 8.0 8.2
## SPONZO,M.J. 6.9 8.3 8.0 8.1 7.9 7.9 7.9 7.7 7.6 7.7 8.1 8.0
## STAPLETON,J.F. 6.5 8.2 7.7 7.8 7.6 7.7 7.7 7.7 7.5 7.6 8.5 7.7
## TESTO,R.J. 8.3 7.3 7.0 6.8 7.0 7.1 6.7 6.7 6.7 6.7 8.0 7.0
## TIERNEY,W.L.JR. 8.3 8.2 7.8 8.3 8.4 8.3 7.7 7.6 7.5 7.7 8.1 7.9
## WALL,R.A. 9.0 7.0 5.9 7.0 7.0 7.2 6.9 6.9 6.5 6.6 7.6 6.6
## WRIGHT,D.B. 7.1 8.4 8.4 7.7 7.5 7.7 7.8 8.2 8.0 8.1 8.3 8.1
## ZARRILLI,K.J. 8.6 7.4 7.0 7.5 7.5 7.7 7.4 7.2 6.9 7.0 7.8 7.1
We see that the lawyers’ names are used to label the rows. We can extract these and use them later for analysis. Will we spot underperformers?
lawyers <- rownames(USJR)
lawyers
## [1] "AARONSON,L.H." "ALEXANDER,J.M." "ARMENTANO,A.J." "BERDON,R.I."
## [5] "BRACKEN,J.J." "BURNS,E.B." "CALLAHAN,R.J." "COHEN,S.S."
## [9] "DALY,J.J." "DANNEHY,J.F." "DEAN,H.H." "DEVITA,H.J."
## [13] "DRISCOLL,P.J." "GRILLO,A.E." "HADDEN,W.L.JR." "HAMILL,E.C."
## [17] "HEALEY.A.H." "HULL,T.C." "LEVINE,I." "LEVISTER,R.L."
## [21] "MARTIN,L.F." "MCGRATH,J.F." "MIGNONE,A.F." "MISSAL,H.M."
## [25] "MULVEY,H.M." "NARUK,H.J." "O'BRIEN,F.J." "O'SULLIVAN,T.J."
## [29] "PASKEY,L." "RUBINOW,J.E." "SADEN.G.A." "SATANIELLO,A.G."
## [33] "SHEA,D.M." "SHEA,J.F.JR." "SIDOR,W.J." "SPEZIALE,J.A."
## [37] "SPONZO,M.J." "STAPLETON,J.F." "TESTO,R.J." "TIERNEY,W.L.JR."
## [41] "WALL,R.A." "WRIGHT,D.B." "ZARRILLI,K.J."
We now apply the methods from the lecture and identify potential outliers. - Boxplots
# i = 1, 2, 3, ..., number of colums
# One boxplot per rating variable to spot outlying ratings.
# seq_len(ncol(...)) replaces the fragile 1:length(colnames(...)) pattern;
# the `alpha = 0.5` argument was dropped because it is not a base-graphics
# parameter (it only triggered "not a graphical parameter" warnings).
for (i in seq_len(ncol(USJR))) {
  # index for matrices or data frames -> [rows, cols];
  # colnames(USJR)[i] is already character, so no as.character() needed
  boxplot(USJR[, i], main = colnames(USJR)[i], col = 12)
}
# Column-wise z-scores in one vectorized step: scale() centers every
# column by its mean and divides by its sample standard deviation
# (denominator n - 1, the same as sd()), replacing the if/else loop
# that grew the data frame one column at a time with cbind().
z_scores <- as.data.frame(scale(USJR))
# renaming the column names by the corresponding variable
colnames(z_scores) <- colnames(USJR)
# adding back the lawyers' names to label the rows
rownames(z_scores) <- lawyers
#z_scores %>% View()
Count the number of outlier ratings for each lawyer: a rating is flagged as an outlier if \(|z| \ge 2.8\), i.e. if it falls outside \(-2.8 < z < 2.8\).
outlier_df <- ifelse(abs(z_scores) >= 2.8, 1, 0) %>% as.data.frame()
outlier_df
## CONT INTG DMNR DILG CFMG DECI PREP FAMI ORAL WRIT PHYS RTEN
## AARONSON,L.H. 0 0 0 0 0 0 0 0 0 0 0 0
## ALEXANDER,J.M. 0 0 0 0 0 0 0 0 0 0 0 0
## ARMENTANO,A.J. 0 0 0 0 0 0 0 0 0 0 0 0
## BERDON,R.I. 0 0 0 0 0 0 0 0 0 0 0 0
## BRACKEN,J.J. 0 0 1 0 0 0 0 0 0 0 0 0
## BURNS,E.B. 0 0 0 0 0 0 0 0 0 0 0 0
## CALLAHAN,R.J. 1 0 0 0 0 0 0 0 0 0 0 0
## COHEN,S.S. 0 0 0 1 0 0 0 0 0 0 0 0
## DALY,J.J. 0 0 0 0 0 0 0 0 0 0 0 0
## DANNEHY,J.F. 0 0 0 0 0 0 0 0 0 0 0 0
## DEAN,H.H. 0 0 0 0 0 0 0 0 0 0 0 0
## DEVITA,H.J. 0 0 0 0 0 0 0 0 0 0 0 0
## DRISCOLL,P.J. 0 0 0 0 0 0 0 0 0 0 0 0
## GRILLO,A.E. 0 0 0 0 0 0 0 0 0 0 0 0
## HADDEN,W.L.JR. 0 0 0 0 0 0 0 0 0 0 0 0
## HAMILL,E.C. 0 0 0 0 0 0 0 0 0 0 0 0
## HEALEY.A.H. 0 0 0 0 0 0 0 0 0 0 0 0
## HULL,T.C. 0 0 0 0 0 0 0 0 0 0 0 0
## LEVINE,I. 0 0 0 0 0 0 0 0 0 0 0 0
## LEVISTER,R.L. 0 0 0 0 0 0 0 0 0 0 0 0
## MARTIN,L.F. 0 0 0 0 0 0 0 0 0 0 0 0
## MCGRATH,J.F. 0 0 0 0 0 0 0 0 0 0 0 0
## MIGNONE,A.F. 0 0 0 0 0 0 0 0 0 0 1 0
## MISSAL,H.M. 0 0 0 0 0 0 0 0 0 0 0 0
## MULVEY,H.M. 0 0 0 0 0 0 0 0 0 0 0 0
## NARUK,H.J. 0 0 0 0 0 0 0 0 0 0 0 0
## O'BRIEN,F.J. 0 0 0 0 0 0 0 0 0 0 0 0
## O'SULLIVAN,T.J. 0 0 0 0 0 0 0 0 0 0 0 0
## PASKEY,L. 0 0 0 0 0 0 0 0 0 0 0 0
## RUBINOW,J.E. 0 0 0 0 0 0 0 0 0 0 0 0
## SADEN.G.A. 0 0 0 0 0 0 0 0 0 0 0 0
## SATANIELLO,A.G. 0 0 0 0 0 0 0 0 0 0 0 0
## SHEA,D.M. 0 0 0 0 0 0 0 0 0 0 0 0
## SHEA,J.F.JR. 0 0 0 0 0 0 0 0 0 0 0 0
## SIDOR,W.J. 0 0 0 0 0 0 0 0 0 0 0 0
## SPEZIALE,J.A. 0 0 0 0 0 0 0 0 0 0 0 0
## SPONZO,M.J. 0 0 0 0 0 0 0 0 0 0 0 0
## STAPLETON,J.F. 0 0 0 0 0 0 0 0 0 0 0 0
## TESTO,R.J. 0 0 0 0 0 0 0 0 0 0 0 0
## TIERNEY,W.L.JR. 0 0 0 0 0 0 0 0 0 0 0 0
## WALL,R.A. 0 0 0 0 0 0 0 0 0 0 0 0
## WRIGHT,D.B. 0 0 0 0 0 0 0 0 0 0 0 0
## ZARRILLI,K.J. 0 0 0 0 0 0 0 0 0 0 0 0
# Add a per-judge count of outlier ratings (rsums).
# rowSums() replaces both the commented-out cbind approach and the
# error-prone manual sum of all twelve column names; it also avoids
# piping outlier_df into mutate() AND passing it again as an argument
# (which spliced the whole data frame in redundantly). Base assignment
# keeps the judges' row names intact.
outlier_df$rsums <- rowSums(outlier_df)
outlier_df
## CONT INTG DMNR DILG CFMG DECI PREP FAMI ORAL WRIT PHYS RTEN
## AARONSON,L.H. 0 0 0 0 0 0 0 0 0 0 0 0
## ALEXANDER,J.M. 0 0 0 0 0 0 0 0 0 0 0 0
## ARMENTANO,A.J. 0 0 0 0 0 0 0 0 0 0 0 0
## BERDON,R.I. 0 0 0 0 0 0 0 0 0 0 0 0
## BRACKEN,J.J. 0 0 1 0 0 0 0 0 0 0 0 0
## BURNS,E.B. 0 0 0 0 0 0 0 0 0 0 0 0
## CALLAHAN,R.J. 1 0 0 0 0 0 0 0 0 0 0 0
## COHEN,S.S. 0 0 0 1 0 0 0 0 0 0 0 0
## DALY,J.J. 0 0 0 0 0 0 0 0 0 0 0 0
## DANNEHY,J.F. 0 0 0 0 0 0 0 0 0 0 0 0
## DEAN,H.H. 0 0 0 0 0 0 0 0 0 0 0 0
## DEVITA,H.J. 0 0 0 0 0 0 0 0 0 0 0 0
## DRISCOLL,P.J. 0 0 0 0 0 0 0 0 0 0 0 0
## GRILLO,A.E. 0 0 0 0 0 0 0 0 0 0 0 0
## HADDEN,W.L.JR. 0 0 0 0 0 0 0 0 0 0 0 0
## HAMILL,E.C. 0 0 0 0 0 0 0 0 0 0 0 0
## HEALEY.A.H. 0 0 0 0 0 0 0 0 0 0 0 0
## HULL,T.C. 0 0 0 0 0 0 0 0 0 0 0 0
## LEVINE,I. 0 0 0 0 0 0 0 0 0 0 0 0
## LEVISTER,R.L. 0 0 0 0 0 0 0 0 0 0 0 0
## MARTIN,L.F. 0 0 0 0 0 0 0 0 0 0 0 0
## MCGRATH,J.F. 0 0 0 0 0 0 0 0 0 0 0 0
## MIGNONE,A.F. 0 0 0 0 0 0 0 0 0 0 1 0
## MISSAL,H.M. 0 0 0 0 0 0 0 0 0 0 0 0
## MULVEY,H.M. 0 0 0 0 0 0 0 0 0 0 0 0
## NARUK,H.J. 0 0 0 0 0 0 0 0 0 0 0 0
## O'BRIEN,F.J. 0 0 0 0 0 0 0 0 0 0 0 0
## O'SULLIVAN,T.J. 0 0 0 0 0 0 0 0 0 0 0 0
## PASKEY,L. 0 0 0 0 0 0 0 0 0 0 0 0
## RUBINOW,J.E. 0 0 0 0 0 0 0 0 0 0 0 0
## SADEN.G.A. 0 0 0 0 0 0 0 0 0 0 0 0
## SATANIELLO,A.G. 0 0 0 0 0 0 0 0 0 0 0 0
## SHEA,D.M. 0 0 0 0 0 0 0 0 0 0 0 0
## SHEA,J.F.JR. 0 0 0 0 0 0 0 0 0 0 0 0
## SIDOR,W.J. 0 0 0 0 0 0 0 0 0 0 0 0
## SPEZIALE,J.A. 0 0 0 0 0 0 0 0 0 0 0 0
## SPONZO,M.J. 0 0 0 0 0 0 0 0 0 0 0 0
## STAPLETON,J.F. 0 0 0 0 0 0 0 0 0 0 0 0
## TESTO,R.J. 0 0 0 0 0 0 0 0 0 0 0 0
## TIERNEY,W.L.JR. 0 0 0 0 0 0 0 0 0 0 0 0
## WALL,R.A. 0 0 0 0 0 0 0 0 0 0 0 0
## WRIGHT,D.B. 0 0 0 0 0 0 0 0 0 0 0 0
## ZARRILLI,K.J. 0 0 0 0 0 0 0 0 0 0 0 0
## rsums
## AARONSON,L.H. 0
## ALEXANDER,J.M. 0
## ARMENTANO,A.J. 0
## BERDON,R.I. 0
## BRACKEN,J.J. 1
## BURNS,E.B. 0
## CALLAHAN,R.J. 1
## COHEN,S.S. 1
## DALY,J.J. 0
## DANNEHY,J.F. 0
## DEAN,H.H. 0
## DEVITA,H.J. 0
## DRISCOLL,P.J. 0
## GRILLO,A.E. 0
## HADDEN,W.L.JR. 0
## HAMILL,E.C. 0
## HEALEY.A.H. 0
## HULL,T.C. 0
## LEVINE,I. 0
## LEVISTER,R.L. 0
## MARTIN,L.F. 0
## MCGRATH,J.F. 0
## MIGNONE,A.F. 1
## MISSAL,H.M. 0
## MULVEY,H.M. 0
## NARUK,H.J. 0
## O'BRIEN,F.J. 0
## O'SULLIVAN,T.J. 0
## PASKEY,L. 0
## RUBINOW,J.E. 0
## SADEN.G.A. 0
## SATANIELLO,A.G. 0
## SHEA,D.M. 0
## SHEA,J.F.JR. 0
## SIDOR,W.J. 0
## SPEZIALE,J.A. 0
## SPONZO,M.J. 0
## STAPLETON,J.F. 0
## TESTO,R.J. 0
## TIERNEY,W.L.JR. 0
## WALL,R.A. 0
## WRIGHT,D.B. 0
## ZARRILLI,K.J. 0
# base R barplot of outlier counts per judge; rowSums() is restricted to
# the twelve rating columns so the rsums helper column (added above) is
# not double-counted in the bar heights
barplot(rowSums(outlier_df[colnames(USJR)]), cex.axis = 2, las = 2, col = 2)
# ggplot equivalent: geom_col() plots the precomputed rsums directly,
# which is clearer than the geom_bar(aes(weight = ...)) workaround
ggplot(data = outlier_df, aes(x = lawyers, y = rsums)) +
  geom_col(fill = "purple", alpha = 0.6) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
outlier_df %>% summarize(lawyers, rsums) %>% arrange(-rsums) %>% head(10)
## lawyers rsums
## 1 BRACKEN,J.J. 1
## 2 CALLAHAN,R.J. 1
## 3 COHEN,S.S. 1
## 4 MIGNONE,A.F. 1
## 5 AARONSON,L.H. 0
## 6 ALEXANDER,J.M. 0
## 7 ARMENTANO,A.J. 0
## 8 BERDON,R.I. 0
## 9 BURNS,E.B. 0
## 10 DALY,J.J. 0
# boxplot(plot = FALSE) computes the statistics without drawing;
# $out collects the values beyond the 1.5 * IQR whiskers of any column
outliers <- boxplot(USJudgeRatings, plot = FALSE)$out
# Print the indices of the outliers
# NOTE(review): which() on the 43 x 12 logical matrix returned by apply()
# yields linear (column-major cell) indices into the matrix, not row
# numbers — verify this is the intended interpretation
cat("Indices of potential outliers:", which(apply(USJudgeRatings, 2, function(x) x %in% outliers)), "\n")
## Indices of potential outliers: 6 7 20 24 51 78 91 94 109 121 127 137 152 180 195 220 223 250 266 278 309 323 324 349 352 358 366 367 379 392 395 409 422 435 444 453 465 478 481 496 508
outlier clustering with dbscan
library(dbscan)
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
# Euclidean distance matrix between all pairs of judges
dist_mtx <- dist(USJR)
# perform DBSCAN clustering on the precomputed distances
cluster <- dbscan(dist_mtx, eps = 1, minPts = 5)
# cluster id 0 marks noise points, i.e. potential outliers
outlier_indices <- which(cluster$cluster == 0)
print(outlier_indices)
## [1] 1 2 5 6 7 8 10 11 12 13 14 16 17 18 19 20 21 22 23 24 26 29 30 31 32
## [26] 33 35 36 39 40 41 42 43
# Column-wise skewness and kurtosis of each rating variable.
# vapply() over the data frame's columns replaces growing the result
# vectors with append() inside a loop (O(n^2) copying) and returns
# named numeric vectors, one entry per column.
s <- vapply(USJR, skewness, numeric(1))
k <- vapply(USJR, kurtosis, numeric(1))
#USJR <- USJR %>% rbind(s, k)
USJR
## CONT INTG DMNR DILG CFMG DECI PREP FAMI ORAL WRIT PHYS RTEN
## AARONSON,L.H. 5.7 7.9 7.7 7.3 7.1 7.4 7.1 7.1 7.1 7.0 8.3 7.8
## ALEXANDER,J.M. 6.8 8.9 8.8 8.5 7.8 8.1 8.0 8.0 7.8 7.9 8.5 8.7
## ARMENTANO,A.J. 7.2 8.1 7.8 7.8 7.5 7.6 7.5 7.5 7.3 7.4 7.9 7.8
## BERDON,R.I. 6.8 8.8 8.5 8.8 8.3 8.5 8.7 8.7 8.4 8.5 8.8 8.7
## BRACKEN,J.J. 7.3 6.4 4.3 6.5 6.0 6.2 5.7 5.7 5.1 5.3 5.5 4.8
## BURNS,E.B. 6.2 8.8 8.7 8.5 7.9 8.0 8.1 8.0 8.0 8.0 8.6 8.6
## CALLAHAN,R.J. 10.6 9.0 8.9 8.7 8.5 8.5 8.5 8.5 8.6 8.4 9.1 9.0
## COHEN,S.S. 7.0 5.9 4.9 5.1 5.4 5.9 4.8 5.1 4.7 4.9 6.8 5.0
## DALY,J.J. 7.3 8.9 8.9 8.7 8.6 8.5 8.4 8.4 8.4 8.5 8.8 8.8
## DANNEHY,J.F. 8.2 7.9 6.7 8.1 7.9 8.0 7.9 8.1 7.7 7.8 8.5 7.9
## DEAN,H.H. 7.0 8.0 7.6 7.4 7.3 7.5 7.1 7.2 7.1 7.2 8.4 7.7
## DEVITA,H.J. 6.5 8.0 7.6 7.2 7.0 7.1 6.9 7.0 7.0 7.1 6.9 7.2
## DRISCOLL,P.J. 6.7 8.6 8.2 6.8 6.9 6.6 7.1 7.3 7.2 7.2 8.1 7.7
## GRILLO,A.E. 7.0 7.5 6.4 6.8 6.5 7.0 6.6 6.8 6.3 6.6 6.2 6.5
## HADDEN,W.L.JR. 6.5 8.1 8.0 8.0 7.9 8.0 7.9 7.8 7.8 7.8 8.4 8.0
## HAMILL,E.C. 7.3 8.0 7.4 7.7 7.3 7.3 7.3 7.2 7.1 7.2 8.0 7.6
## HEALEY.A.H. 8.0 7.6 6.6 7.2 6.5 6.5 6.8 6.7 6.4 6.5 6.9 6.7
## HULL,T.C. 7.7 7.7 6.7 7.5 7.4 7.5 7.1 7.3 7.1 7.3 8.1 7.4
## LEVINE,I. 8.3 8.2 7.4 7.8 7.7 7.7 7.7 7.8 7.5 7.6 8.0 8.0
## LEVISTER,R.L. 9.6 6.9 5.7 6.6 6.9 6.6 6.2 6.0 5.8 5.8 7.2 6.0
## MARTIN,L.F. 7.1 8.2 7.7 7.1 6.6 6.6 6.7 6.7 6.8 6.8 7.5 7.3
## MCGRATH,J.F. 7.6 7.3 6.9 6.8 6.7 6.8 6.4 6.3 6.3 6.3 7.4 6.6
## MIGNONE,A.F. 6.6 7.4 6.2 6.2 5.4 5.7 5.8 5.9 5.2 5.8 4.7 5.2
## MISSAL,H.M. 6.2 8.3 8.1 7.7 7.4 7.3 7.3 7.3 7.2 7.3 7.8 7.6
## MULVEY,H.M. 7.5 8.7 8.5 8.6 8.5 8.4 8.5 8.5 8.4 8.4 8.7 8.7
## NARUK,H.J. 7.8 8.9 8.7 8.9 8.7 8.8 8.9 9.0 8.8 8.9 9.0 9.0
## O'BRIEN,F.J. 7.1 8.5 8.3 8.0 7.9 7.9 7.8 7.8 7.8 7.7 8.3 8.2
## O'SULLIVAN,T.J. 7.5 9.0 8.9 8.7 8.4 8.5 8.4 8.3 8.3 8.3 8.8 8.7
## PASKEY,L. 7.5 8.1 7.7 8.2 8.0 8.1 8.2 8.4 8.0 8.1 8.4 8.1
## RUBINOW,J.E. 7.1 9.2 9.0 9.0 8.4 8.6 9.1 9.1 8.9 9.0 8.9 9.2
## SADEN.G.A. 6.6 7.4 6.9 8.4 8.0 7.9 8.2 8.4 7.7 7.9 8.4 7.5
## SATANIELLO,A.G. 8.4 8.0 7.9 7.9 7.8 7.8 7.6 7.4 7.4 7.4 8.1 7.9
## SHEA,D.M. 6.9 8.5 7.8 8.5 8.1 8.2 8.4 8.5 8.1 8.3 8.7 8.3
## SHEA,J.F.JR. 7.3 8.9 8.8 8.7 8.4 8.5 8.5 8.5 8.4 8.4 8.8 8.8
## SIDOR,W.J. 7.7 6.2 5.1 5.6 5.6 5.9 5.6 5.6 5.3 5.5 6.3 5.3
## SPEZIALE,J.A. 8.5 8.3 8.1 8.3 8.4 8.2 8.2 8.1 7.9 8.0 8.0 8.2
## SPONZO,M.J. 6.9 8.3 8.0 8.1 7.9 7.9 7.9 7.7 7.6 7.7 8.1 8.0
## STAPLETON,J.F. 6.5 8.2 7.7 7.8 7.6 7.7 7.7 7.7 7.5 7.6 8.5 7.7
## TESTO,R.J. 8.3 7.3 7.0 6.8 7.0 7.1 6.7 6.7 6.7 6.7 8.0 7.0
## TIERNEY,W.L.JR. 8.3 8.2 7.8 8.3 8.4 8.3 7.7 7.6 7.5 7.7 8.1 7.9
## WALL,R.A. 9.0 7.0 5.9 7.0 7.0 7.2 6.9 6.9 6.5 6.6 7.6 6.6
## WRIGHT,D.B. 7.1 8.4 8.4 7.7 7.5 7.7 7.8 8.2 8.0 8.1 8.3 8.1
## ZARRILLI,K.J. 8.6 7.4 7.0 7.5 7.5 7.7 7.4 7.2 6.9 7.0 7.8 7.1
# 3 x 4 grid: one kernel-density plot per rating variable,
# with a vertical line at the column mean
par(mfrow = c(3, 4))
for (j in seq_len(ncol(USJR))) {
  col_values <- USJR[, j]
  col_density <- density(col_values)
  plot(col_density, main = colnames(USJR)[j], col = 2)
  abline(v = mean(col_values), col = "blue")
}
We now consider the stock prices for this company through 2022 (on a daily basis).
RBI <- read_csv("https://raw.githubusercontent.com/MarkusStefan/Data_Analytics/main/Exercise1/RBI.VI.csv")
## Rows: 4485 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): Open, High, Low, Close, Adj Close, Volume
## date (1): Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# derive month/year labels from the Date column, and a signed daily
# dollar-volume measure from the intraday price change.
# Inside mutate() columns are referenced directly rather than via RBI$,
# which bypasses the pipeline's data and breaks after earlier steps
# change the data frame.
RBI <- RBI %>% mutate(month = months(Date),
                      year = year(Date),
                      DailyRev = (`Adj Close` - Open) * Volume)
ggplot(data=RBI, aes(x=Date, y=DailyRev)) +
geom_line(col='dark blue')
potential outliers. - Standardizing the data (z-scores)
# standardize the daily revenue to z-scores (mean 0, sd 1)
RBI <- RBI %>%
  mutate(DailyRevNorm = (DailyRev - mean(DailyRev)) / sd(DailyRev))
# the standardized series over time
ggplot(data = RBI, aes(x = Date, y = DailyRevNorm)) +
  geom_line(col = 'dark blue')
removing outliers by z-scores
library(lubridate)
# rows whose standardized daily revenue lies more than 3 SDs from the
# mean; equivalently: RBI %>% filter(abs(DailyRevNorm) > 3)
outliers_RBI <- RBI[abs(RBI$DailyRevNorm) > 3, ]
# month names and years of the outlier dates, for the barplots below
m <- months(outliers_RBI$Date)
y <- year(outliers_RBI$Date)
outliers_RBI
## # A tibble: 89 × 11
## Date Open High Low Close Adj C…¹ Volume month year Daily…² Daily…³
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2005-04-25 37.3 39.9 37.3 39.6 20.0 7.45e6 April 2005 -1.29e8 -19.0
## 2 2005-04-26 39.6 39.6 38.4 38.8 19.6 1.42e6 April 2005 -2.83e7 -3.53
## 3 2005-04-27 38.4 38.6 38.3 38.4 19.4 1.63e6 April 2005 -3.10e7 -3.95
## 4 2006-02-28 68.6 68.9 66.4 67.0 33.9 1.86e6 Febr… 2006 -6.46e7 -9.12
## 5 2006-05-22 66.1 66.1 56.5 56.5 28.6 9.18e5 May 2006 -3.44e7 -4.48
## 6 2006-05-23 57.6 63.5 57.6 62.6 31.6 1.13e6 May 2006 -2.93e7 -3.70
## 7 2006-06-08 61.3 61.8 59.6 60.3 30.5 9.59e5 June 2006 -2.95e7 -3.73
## 8 2007-01-08 108. 108. 103. 105. 53.6 7.56e5 Janu… 2007 -4.14e7 -5.55
## 9 2007-02-27 106. 106. 98.2 98.2 50.0 9.42e5 Febr… 2007 -5.26e7 -7.28
## 10 2007-02-28 89.1 99.5 88.1 97.4 49.6 8.91e5 Febr… 2007 -3.52e7 -4.59
## # … with 79 more rows, and abbreviated variable names ¹`Adj Close`, ²DailyRev,
## # ³DailyRevNorm
# number of outlier days per month name (all years pooled)
barplot(table(m), las=2, col=3)
# number of outlier days per year
barplot(table(y), col="pink")
option to spot outliers with boxplot()
boxplot(RBI$DailyRev)$out
## [1] -129074750 -28288247 -31011464 -13057319 -13225256 -12283425
## [7] -12438510 -14405735 -14730996 -13563444 -14144867 -13228900
## [13] -14685421 -16900141 -17367965 -12114524 -19692963 -15782094
## [19] -14866920 -12613888 -64601702 -15005763 -12579335 -12846175
## [25] -20700257 -13241903 -15430144 -17431249 -13767322 -18833545
## [31] -19587314 -20112003 -15001034 -19348913 -19932006 -34431853
## [37] -29336773 -22711940 -19124261 -13979831 -17381912 -12376512
## [43] -13234784 -29539156 -12987009 -22543823 -18061791 -12422268
## [49] -15959999 -18660325 -18203786 -13442049 -15074129 -12238741
## [55] -13497191 -17914249 -13465363 -13695796 -16290995 -12479647
## [61] -12368022 -18483396 -24036889 -24786368 -24785533 -13741164
## [67] -15102844 -14573298 -23290236 -12308724 -12209884 -16287063
## [73] -21995843 -41416037 -23323775 -15061830 -16341171 -17142398
## [79] -14457131 -12194801 -12543620 -13950902 -17932176 -15375077
## [85] -52598388 -35174358 -27155848 -16330459 -28508474 -13040246
## [91] -26203584 -20435637 -14751098 -22645304 -17673491 -25907245
## [97] -15297639 -13180873 -14048164 -14226979 -26085966 -12519471
## [103] -14098858 -13456119 -12787608 -14662154 -18640542 -17306006
## [109] -18894652 -17287905 -17195844 -16129426 -13748119 -18237145
## [115] -19207953 -29947103 -13818184 -31748625 -36223591 -13450747
## [121] -17082614 -40923916 -25137593 -12253291 -13761394 -23776117
## [127] -26464784 -14346593 -27374072 -18342619 -24172872 -33160824
## [133] -18361023 -23090335 -16499626 -18115527 -22899928 -24328073
## [139] -14029240 -16553322 -13252918 -15422689 -15874294 -15033538
## [145] -13932170 -14252721 -14034882 -14509180 -23749601 -19538535
## [151] -17389858 -13267782 -12384916 -19251436 -12736969 -22603814
## [157] -37609605 -29711803 -21721699 -41827406 -57920846 -23561061
## [163] -12344702 -13853310 -14142403 -16204006 -15433024 -13752508
## [169] -19477078 -17168568 -16732015 -15119369 -14029752 -27891830
## [175] -13947503 -22959526 -19995471 -25027550 -16471197 -26790594
## [181] -28598505 -61495978 -34261135 -17465032 -14820541 -20085213
## [187] -17356350 -16125767 -12883101 -15534498 -15488827 -20251074
## [193] -17770289 -20581707 -30355892 -22744576 -14430166 -13135976
## [199] -13313342 -20838940 -14966050 -22635387 -34005454 -23218115
## [205] -20879587 -14373804 -18427344 -13453671 -26313051 -22506763
## [211] -17287251 -18125684 -15028421 -13166284 -14011018 -14510536
## [217] -12181995 -14418627 -12667876 -14654435 -21276634 -18807763
## [223] -16627286 -35896482 -19939193 -38321560 -26842460 -21391910
## [229] -31008546 -27064728 -21330785 -25170135 -27314612 -36463305
## [235] -27939493 -25754726 -13574441 -16668449 -21973572 -19901501
## [241] -17481750 -14761275 -18393312 -15928474 -12371537 -19696256
## [247] -22634046 -18483505 -15211204 -16857100 -15574940 -35811437
## [253] -29593964 -23345585 -59681005 -24207759 -35127333 -24585848
## [259] -15093792 -13510179 -14245385 -30363350 -21647011 -18406427
## [265] -25280718 -16694454 -15706419 -13855999 -20709696 -19933432
## [271] -15610560 -17091012 -14317396 -40949418 -29920223 -16129843
## [277] -12526243 -19066482 -12147111 -19578257 -14199551 -14602882
## [283] -18382416 -17027891 -18820804 -15418931 -13573159 -16955433
## [289] -20533090 -39226369 -32461708 -17816711 -20102270 -14889283
## [295] -60113061 -15617312 -18240480 -15324198 -15655317 -23364056
## [301] -12970807 -21942427 -16547946 -19757915 -13099864 -14769805
## [307] -12989053 -18672712 -15000216 -14798086 -18219696 -15554677
## [313] -16118614 -13471155 -16116226 -18841627 -14612056 -13542054
## [319] -17160462 -17810170 -26905650 -17924793 -31353144 -22087724
## [325] -17448694 -14508487 -15077593 -14234642 -18748717 -15739272
## [331] -27835243 -30730150 -24808842 -26407928 -15314180 -13912734
## [337] -21458040 -14727583 -18601521 -14239270 -14433164 -12267962
## [343] -16110419 -31334228 -23157312 -17303462 -16404794 -13769975
## [349] -28509417 -13198965 -12277363 -19329465 -22561259 -14362152
## [355] -14901149 -38229628 -25837708 -22715318 -45815955 -60252235
## [361] -46878395 -66673555 -17401111 -27101820 -13855716 -17560890
## [367] -20776772 -37041516 -27139561 -26198161 -26758741 -17222398
## [373] -17871200 -33075526 -45292806 -30738419 -18401678 -53743770
## [379] -37111349 -37859792 -24209564 -16202627 -12316678 -13939625
## [385] -13154578 -15148029 -20272461 -15262680 -14842008 -21068872
## [391] -33215762 -14880294 -15470824 -17545932 -19707413 -17173873
## [397] -17670485 -13604429 -13073495 -12111150 -12902470 -15655081
## [403] -14709049 -46698201 -32045478 -19099575 -22103925 -13023435
## [409] -21744505 -12110475 -23072084 -18168188 -18562748 -13667872
## [415] -13294176 -16838141 -15427201 -13018389 -14429391 -44261890
## [421] -37790525 -14175200 -12932614 -13129797 -13704791 -18682254
## [427] -25872472 -18398747 -14280919 -18953733 -13623254 -13717891
## [433] -13454238 -12257898 -15000882 -12859594 -23441442 -13428714
## [439] -13665917 -12371794 -13551189 -18015551 -26448393 -13293590
## [445] -13706998 -13460674 -15041022 -33039262 -13256939 -20262386
## [451] -14023370 -12892207 -12357426 -16603473
function for computing the values of Q1 and Q3
# Compute the first and third quartile of a numeric vector by the
# median-of-halves approach.
#
# x: numeric vector (no NA handling; NA values would propagate)
# returns: list(q1 = ..., q3 = ...)
#
# NOTE(review): for halves of even length, q3_index points at the lower
# neighbour of the upper median, so results can differ from
# quantile(x, c(.25, .75)); kept as-is to preserve documented output.
q1q3 <- function(x) {
  sorted <- sort(x, decreasing = FALSE)
  # n observations
  n <- length(sorted)
  # index of the median; ceiling() picks the upper index for odd n
  median_index <- ceiling(n / 2)
  # first quartile: middle of the lower half
  q1_index <- ceiling(median_index / 2)
  q1 <- ifelse(median_index %% 2 == 0,
               (sorted[q1_index] + sorted[q1_index + 1]) / 2,
               # else: odd-length half -> take the middle element directly
               sorted[q1_index])
  # third quartile, mirrored into the upper half
  # (-1 such that indices are not skewed)
  q3_index <- median_index + q1_index - 1
  q3 <- ifelse(median_index %% 2 == 0,
               (sorted[q3_index] + sorted[q3_index + 1]) / 2,
               # else
               sorted[q3_index])
  # return results for q1 & q3
  # (a stray `t` that evaluated and discarded the transpose function
  #  has been removed from this line)
  return(list(q1 = q1, q3 = q3))
}
q1q3(RBI$DailyRev)
## $q1
## [1] -5997934
##
## $q3
## [1] -1931826
# quartiles and IQR of the daily revenue; the explicit sort() calls were
# redundant because quantile() and IQR() order the data internally
q3 <- quantile(RBI$DailyRev, 0.75)
q1 <- quantile(RBI$DailyRev, 0.25)
q1_q3 <- IQR(RBI$DailyRev)
# keep only rows inside the 1.5 * IQR whiskers.
# BUG FIX: `!RBI$DailyRev < lo | RBI$DailyRev > hi` parsed as
# `(!(DailyRev < lo)) | (DailyRev > hi)` because `!` has lower precedence
# than `<` in R — that condition never removed high-side outliers.
no_outliers <- RBI[RBI$DailyRev >= (q1 - 1.5 * q1_q3) &
                     RBI$DailyRev <= (q3 + 1.5 * q1_q3), ]
no_outliers
## # A tibble: 4,031 × 11
## Date Open High Low Close Adj C…¹ Volume month year Daily…² Daily…³
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2005-05-02 38.4 38.9 38.2 38.5 19.4 354371 May 2005 -6.74e6 -0.216
## 2 2005-05-03 38.5 38.6 38.1 38.3 19.4 236774 May 2005 -4.53e6 0.123
## 3 2005-05-04 38.3 38.3 38.1 38.2 19.3 190578 May 2005 -3.62e6 0.263
## 4 2005-05-05 38.2 38.2 38.2 38.2 19.3 0 May 2005 0 0.821
## 5 2005-05-09 37.7 38.1 37.4 37.6 19.0 223719 May 2005 -4.19e6 0.177
## 6 2005-05-10 37.7 38.5 37.6 38.5 19.4 327572 May 2005 -5.99e6 -0.101
## 7 2005-05-11 38.4 39.6 38.4 38.8 19.6 358542 May 2005 -6.73e6 -0.215
## 8 2005-05-12 39.3 40.8 39.3 40.8 20.6 384865 May 2005 -7.18e6 -0.284
## 9 2005-05-13 40.2 40.8 39.9 40.6 20.5 354108 May 2005 -6.98e6 -0.253
## 10 2005-05-16 40.6 40.6 40.6 40.6 20.5 0 May 2005 0 0.821
## # … with 4,021 more rows, and abbreviated variable names ¹`Adj Close`,
## # ²DailyRev, ³DailyRevNorm
# daily revenue without the IQR outliers, coloured by year
ggplot(data=no_outliers, aes(x=Date, y=DailyRev, col=factor(year))) +
  geom_line() +
  theme_minimal()
#scale_color_brewer(palette="Spectral")
# the standardized series, one panel per calendar month
ggplot(data=no_outliers, aes(x=Date, y=DailyRevNorm, color=month)) +
  geom_line() +
  facet_wrap(~month) #+ scale_color_brewer(palette="Spectral")
outliers per month
# binary flag: 1 if the standardized daily revenue lies beyond 3 SDs
RBI <- RBI %>% mutate(outlier = as.numeric(abs(DailyRevNorm) > 3))
# outlier count per calendar month, most affected month first
RBI %>%
  drop_na() %>%
  dplyr::group_by(month) %>%
  summarize(total_outliers = sum(outlier)) %>%
  arrange(-total_outliers)
## # A tibble: 12 × 2
## month total_outliers
## <chr> <dbl>
## 1 January 13
## 2 October 13
## 3 May 11
## 4 September 11
## 5 March 10
## 6 February 8
## 7 April 7
## 8 August 6
## 9 July 5
## 10 November 3
## 11 June 2
## 12 December 0
Time when most outliers occurred
# month/year label, e.g. "January/2008"
RBI <- RBI %>% mutate(MY = paste(month, year, sep = "/"))
# the ten month/year periods with the most outlier days
RBI %>%
  dplyr::group_by(MY) %>%
  summarize(total_outliers = sum(outlier)) %>%
  arrange(desc(total_outliers)) %>%
  head(n = 10)
## # A tibble: 10 × 2
## MY total_outliers
## <chr> <dbl>
## 1 January/2008 10
## 2 September/2008 9
## 3 October/2008 8
## 4 May/2007 7
## 5 July/2008 5
## 6 March/2007 5
## 7 August/2007 4
## 8 October/2007 4
## 9 April/2005 3
## 10 April/2008 3
#CIA <- read_csv("export.csv", sep=';')
# NOTE(review): readr::read_csv() has no `sep` argument; the commented
# call above would need read_csv2() or read_delim(delim = ";")
countries <- c("AT", "DE")        # country codes: Austria, Germany
pop <- c(9000000, 79000000)       # population
public_debt <- c(99.91, 63.90)    # public debt (not used below)
unrate <- c(6.3, 3.54)            # unemployment rate
# the empty, never-used `gdp <- c()` placeholder was removed
comparison <- data.frame(countries, pop, unrate)
comparison
## countries pop unrate
## 1 AT 9.0e+06 6.30
## 2 DE 7.9e+07 3.54
#barplot(data=comparison[, 2])
# geom_col() is the idiomatic shorthand for geom_bar(stat = "identity")
# when the bar heights are already present in the data
ggplot(comparison, aes(x = countries, y = pop)) +
  geom_col(fill = "blue") +
  labs(x = "Country", y = "Population Size") +
  ggtitle("Population Sizes of Germany and Austria")
#import pandas as pd
#import numpy as np
#import matplotlib.pyplot as plt
#import requests
#url = "https://www.imf.org/external/datamapper/api/v1/NGDP_RPCH/AUT/DEU?year=2023&yaxis=lin"
#response = requests.get(url)
#data = response.json()
#df = pd.read_json(url)
#data
def loop(msg):
    """Count each character of msg (first-seen order) and build an
    index list over the distinct characters.

    Prints msg as a side effect and returns (counts_dict, index_list).
    """
    counts = {}
    for ch in msg:
        # dict.get with a default replaces the explicit membership test
        counts[ch] = counts.get(ch, 0) + 1
    # indices 0 .. number-of-distinct-characters - 1
    indices = list(range(len(counts)))
    print(msg)
    return counts, indices
loop('hello world!')
## hello world!
## ({'h': 1, 'e': 1, 'l': 3, 'o': 2, ' ': 1, 'w': 1, 'r': 1, 'd': 1, '!': 1}, [0, 1, 2, 3, 4, 5, 6, 7, 8])
#happy <- read_csv("https://raw.githubusercontent.com/etadpu/open_source_data/main/happiness-cantril-ladder.csv")
happy <- read_csv("https://raw.githubusercontent.com/MarkusStefan/Data_Analytics/main/Exercise1/happiness-cantril-ladder.csv")
## Rows: 2381 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Cantril ladder score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# normalize the column names to snake_case (janitor::make_clean_names),
# e.g. "Cantril ladder score" -> "cantril_ladder_score"
colnames(happy) <- make_clean_names(colnames(happy))
happy
## # A tibble: 2,381 × 4
## entity code year cantril_ladder_score
## <chr> <chr> <dbl> <dbl>
## 1 Afghanistan AFG 2006 3.72
## 2 Afghanistan AFG 2007 4.40
## 3 Afghanistan AFG 2008 4.76
## 4 Afghanistan AFG 2009 3.83
## 5 Afghanistan AFG 2010 3.78
## 6 Afghanistan AFG 2011 3.57
## 7 Afghanistan AFG 2012 3.13
## 8 Afghanistan AFG 2013 3.98
## 9 Afghanistan AFG 2014 4.22
## 10 Afghanistan AFG 2015 2.66
## # … with 2,371 more rows
# extract the 2010 and 2020 cross-sections from the panel data;
# drop columns 2 & 3 (code, year) so only country and happiness score
# remain, sorted by score in descending order (-).
# The trailing `%>% invisible()` calls were removed: assignment already
# suppresses printing, so they were misleading no-ops.
h2010 <- happy[happy$year == 2010, -(2:3)] %>% arrange(-cantril_ladder_score)
h2020 <- happy[happy$year == 2020, -(2:3)] %>% arrange(-cantril_ladder_score)
# rename the score columns by source year
colnames(h2010)[2] <- "score2010"
colnames(h2020)[2] <- "score2020"
# top 5 and bottom 5 countries by 2010 score
head(h2010, 5); top_n(h2010, -5)
## # A tibble: 5 × 2
## entity score2010
## <chr> <dbl>
## 1 Switzerland 7.78
## 2 Norway 7.68
## 3 Iceland 7.59
## 4 Sweden 7.56
## 5 Denmark 7.52
## Selecting by score2010
## # A tibble: 5 × 2
## entity score2010
## <chr> <dbl>
## 1 Guinea 3.65
## 2 Madagascar 3.55
## 3 Rwanda 3.33
## 4 Benin 3.19
## 5 Syria 3.16
# top & bottom 5 countries by 2020 happiness score
head(h2020, 5); top_n(h2020, -5)
## # A tibble: 5 × 2
## entity score2020
## <chr> <dbl>
## 1 Finland 7.82
## 2 Denmark 7.64
## 3 Iceland 7.56
## 4 Switzerland 7.51
## 5 Netherlands 7.41
## Selecting by score2020
## # A tibble: 5 × 2
## entity score2020
## <chr> <dbl>
## 1 Botswana 3.47
## 2 Rwanda 3.27
## 3 Zimbabwe 3.00
## 4 Lebanon 2.96
## 5 Afghanistan 2.40
# merging both dataframes on the country column
# all.x = TRUE makes this a left (outer) join: every country present in 2010
# is kept, and countries without a 2020 score get NA (e.g. Qatar, Syria below)
h_2010_2020 <- merge(h2010, h2020, by="entity", all.x = TRUE)
h_2010_2020
## entity score2010 score2020
## 1 Afghanistan 3.782938 2.4038
## 2 Albania 5.510124 5.1988
## 3 Algeria 5.604596 5.1223
## 4 Angola 4.360250 NA
## 5 Argentina 6.468387 5.9670
## 6 Armenia 4.319712 5.3986
## 7 Australia 7.195586 7.1621
## 8 Austria 7.400689 7.1630
## 9 Azerbaijan 4.910772 5.1734
## 10 Bahrain 5.027187 6.6469
## 11 Bangladesh 4.724444 5.1555
## 12 Belarus 5.749043 5.8215
## 13 Belgium 6.935122 6.8050
## 14 Benin 3.193469 4.6232
## 15 Bolivia 6.018895 5.6003
## 16 Bosnia and Herzegovina 4.773145 5.7680
## 17 Botswana 4.835939 3.4711
## 18 Brazil 6.660004 6.2928
## 19 Bulgaria 4.222297 5.3709
## 20 Burkina Faso 3.955008 4.6705
## 21 Cambodia 3.898707 4.6403
## 22 Cameroon 4.244634 5.0476
## 23 Canada 7.415144 7.0251
## 24 Chad 4.032975 4.2508
## 25 Chile 6.599129 6.1718
## 26 China 5.094917 5.5853
## 27 Colombia 6.374880 5.7806
## 28 Comoros 3.955640 4.6086
## 29 Congo 3.919342 5.0752
## 30 Costa Rica 7.272250 6.5820
## 31 Croatia 6.027635 6.1252
## 32 Cyprus 6.180507 6.2209
## 33 Czechia 6.334149 6.9198
## 34 Democratic Republic of Congo 4.639227 NA
## 35 Denmark 7.519909 7.6362
## 36 Dominican Republic 4.753311 5.7372
## 37 Ecuador 5.960716 5.5333
## 38 Egypt 4.204157 4.2877
## 39 El Salvador 5.934372 6.1200
## 40 Estonia 5.363928 6.3408
## 41 Ethiopia 4.561169 4.2415
## 42 Finland 7.420209 7.8210
## 43 France 6.649365 6.6867
## 44 Gabon 3.972059 4.9583
## 45 Georgia 4.254446 4.9732
## 46 Germany 6.702362 7.0341
## 47 Ghana 5.057262 4.8720
## 48 Greece 5.096354 5.9482
## 49 Guatemala 5.855717 6.2622
## 50 Guinea 3.651555 4.8905
## 51 Haiti 4.413475 NA
## 52 Honduras 4.602218 6.0221
## 53 Hong Kong 5.483765 5.4248
## 54 Hungary 4.683358 6.0859
## 55 Iceland 7.590660 7.5575
## 56 India 4.720147 3.7771
## 57 Indonesia 5.367774 5.2396
## 58 Iran 4.608928 4.8879
## 59 Iraq 4.659509 4.9409
## 60 Ireland 6.964645 7.0408
## 61 Israel 7.110855 7.3638
## 62 Italy 5.839314 6.4667
## 63 Japan 5.968216 6.0389
## 64 Jordan 5.131996 4.1521
## 65 Kazakhstan 5.759469 6.2341
## 66 Kenya 4.547335 4.5430
## 67 Kosovo 5.639588 6.4551
## 68 Kuwait 6.221095 6.1061
## 69 Kyrgyzstan 5.207786 5.8285
## 70 Laos 4.876085 5.1403
## 71 Latvia 5.125025 6.1804
## 72 Lebanon 4.572567 2.9553
## 73 Libya 5.754394 5.3302
## 74 Lithuania 5.771037 6.4456
## 75 Luxembourg 6.964097 7.4040
## 76 Madagascar 3.550610 4.3391
## 77 Malawi 4.279270 3.7504
## 78 Malaysia 5.914284 5.7113
## 79 Mali 4.313017 4.4790
## 80 Malta 5.962872 6.4469
## 81 Mauritania 4.673204 4.1526
## 82 Mexico 7.320185 6.1276
## 83 Moldova 5.995713 5.8568
## 84 Mongolia 4.885150 5.7607
## 85 Montenegro 5.218724 5.5468
## 86 Morocco 4.969657 5.0599
## 87 Myanmar 4.438940 4.3942
## 88 Nepal 4.233245 5.3765
## 89 Netherlands 7.470715 7.4149
## 90 New Zealand 7.249630 7.1998
## 91 Nicaragua 5.448006 6.1646
## 92 Niger 3.798088 5.0035
## 93 Nigeria 5.492954 4.5520
## 94 North Macedonia 4.639647 5.1990
## 95 Northern Cyprus 5.463305 5.4666
## 96 Norway 7.678277 7.3651
## 97 Pakistan 5.131565 4.5158
## 98 Palestine 4.646608 4.4825
## 99 Panama 6.859836 6.3091
## 100 Paraguay 5.820058 5.5783
## 101 Peru 5.824557 5.5591
## 102 Philippines 5.001965 5.9036
## 103 Poland 5.875932 6.1228
## 104 Portugal 4.993962 6.0163
## 105 Qatar 6.611299 NA
## 106 Romania 5.166875 6.4770
## 107 Russia 5.620736 5.4586
## 108 Rwanda 3.333048 3.2682
## 109 Saudi Arabia 6.396359 6.5228
## 110 Senegal 3.668737 5.0461
## 111 Serbia 5.154522 6.1780
## 112 Slovakia 5.911059 6.3907
## 113 Slovenia 6.062891 6.6301
## 114 Somaliland region 5.057314 NA
## 115 South Africa 5.133888 5.1939
## 116 South Korea 6.003287 5.9351
## 117 Spain 6.290690 6.4763
## 118 Sri Lanka 4.224593 4.3622
## 119 Sudan 4.550499 NA
## 120 Suriname 6.269287 NA
## 121 Sweden 7.560148 7.3843
## 122 Switzerland 7.776209 7.5116
## 123 Syria 3.164491 NA
## 124 Taiwan 6.125917 6.5125
## 125 Tajikistan 4.496571 5.3771
## 126 Tanzania 4.006897 3.7024
## 127 Thailand 6.300235 5.8912
## 128 Tunisia 4.463531 4.5160
## 129 Turkey 5.309076 4.7442
## 130 Turkmenistan 5.463827 5.4743
## 131 Uganda 4.309238 4.6026
## 132 Ukraine 5.030342 5.0841
## 133 United Arab Emirates 7.217767 6.5760
## 134 United Kingdom 6.880784 6.9425
## 135 United States 7.026227 6.9768
## 136 Uruguay 6.449728 6.4736
## 137 Uzbekistan 6.019332 6.0627
## 138 Venezuela 7.066577 4.9255
## 139 Vietnam 5.534570 5.4850
## 140 Yemen 4.060601 4.1969
## 141 Zambia 5.013375 3.7598
## 142 Zimbabwe 4.955101 2.9951
# other option: joining with a SQL statement
# renaming colnames s.t. we remove ambiguity
# (joining won't work if columns have the same names)
# rename the columns so the two tables share no column names before joining
colnames(h2010) <- c("entity2010", "score2010")
colnames(h2020) <- c("entity2020","score2020")
library(sqldf)
# LEFT JOIN keeps every row of h2010; countries with no 2020 match get NA
# (equivalent to merge(..., all.x = TRUE) above, but columns from both tables
#  are kept side by side)
sqldf("
SELECT *
FROM h2010
LEFT JOIN h2020
ON entity2010 = entity2020
")
## entity2010 score2010 entity2020 score2020
## 1 Switzerland 7.776209 Switzerland 7.5116
## 2 Norway 7.678277 Norway 7.3651
## 3 Iceland 7.590660 Iceland 7.5575
## 4 Sweden 7.560148 Sweden 7.3843
## 5 Denmark 7.519909 Denmark 7.6362
## 6 Netherlands 7.470715 Netherlands 7.4149
## 7 Finland 7.420209 Finland 7.8210
## 8 Canada 7.415144 Canada 7.0251
## 9 Austria 7.400689 Austria 7.1630
## 10 Mexico 7.320185 Mexico 6.1276
## 11 Costa Rica 7.272250 Costa Rica 6.5820
## 12 New Zealand 7.249630 New Zealand 7.1998
## 13 United Arab Emirates 7.217767 United Arab Emirates 6.5760
## 14 Australia 7.195586 Australia 7.1621
## 15 Israel 7.110855 Israel 7.3638
## 16 Venezuela 7.066577 Venezuela 4.9255
## 17 United States 7.026227 United States 6.9768
## 18 Ireland 6.964645 Ireland 7.0408
## 19 Luxembourg 6.964097 Luxembourg 7.4040
## 20 Belgium 6.935122 Belgium 6.8050
## 21 United Kingdom 6.880784 United Kingdom 6.9425
## 22 Panama 6.859836 Panama 6.3091
## 23 Germany 6.702362 Germany 7.0341
## 24 Brazil 6.660004 Brazil 6.2928
## 25 France 6.649365 France 6.6867
## 26 Qatar 6.611299 <NA> NA
## 27 Chile 6.599129 Chile 6.1718
## 28 Argentina 6.468387 Argentina 5.9670
## 29 Uruguay 6.449728 Uruguay 6.4736
## 30 Saudi Arabia 6.396359 Saudi Arabia 6.5228
## 31 Colombia 6.374880 Colombia 5.7806
## 32 Czechia 6.334149 Czechia 6.9198
## 33 Thailand 6.300235 Thailand 5.8912
## 34 Spain 6.290690 Spain 6.4763
## 35 Suriname 6.269287 <NA> NA
## 36 Kuwait 6.221095 Kuwait 6.1061
## 37 Cyprus 6.180507 Cyprus 6.2209
## 38 Taiwan 6.125917 Taiwan 6.5125
## 39 Slovenia 6.062891 Slovenia 6.6301
## 40 Croatia 6.027635 Croatia 6.1252
## 41 Uzbekistan 6.019332 Uzbekistan 6.0627
## 42 Bolivia 6.018895 Bolivia 5.6003
## 43 South Korea 6.003287 South Korea 5.9351
## 44 Moldova 5.995713 Moldova 5.8568
## 45 Japan 5.968216 Japan 6.0389
## 46 Malta 5.962872 Malta 6.4469
## 47 Ecuador 5.960716 Ecuador 5.5333
## 48 El Salvador 5.934372 El Salvador 6.1200
## 49 Malaysia 5.914284 Malaysia 5.7113
## 50 Slovakia 5.911059 Slovakia 6.3907
## 51 Poland 5.875932 Poland 6.1228
## 52 Guatemala 5.855717 Guatemala 6.2622
## 53 Italy 5.839314 Italy 6.4667
## 54 Peru 5.824557 Peru 5.5591
## 55 Paraguay 5.820058 Paraguay 5.5783
## 56 Lithuania 5.771037 Lithuania 6.4456
## 57 Kazakhstan 5.759469 Kazakhstan 6.2341
## 58 Libya 5.754394 Libya 5.3302
## 59 Belarus 5.749043 Belarus 5.8215
## 60 Kosovo 5.639588 Kosovo 6.4551
## 61 Russia 5.620736 Russia 5.4586
## 62 Algeria 5.604596 Algeria 5.1223
## 63 Vietnam 5.534570 Vietnam 5.4850
## 64 Albania 5.510124 Albania 5.1988
## 65 Nigeria 5.492954 Nigeria 4.5520
## 66 Hong Kong 5.483765 Hong Kong 5.4248
## 67 Turkmenistan 5.463827 Turkmenistan 5.4743
## 68 Northern Cyprus 5.463305 Northern Cyprus 5.4666
## 69 Nicaragua 5.448006 Nicaragua 6.1646
## 70 Indonesia 5.367774 Indonesia 5.2396
## 71 Estonia 5.363928 Estonia 6.3408
## 72 Turkey 5.309076 Turkey 4.7442
## 73 Montenegro 5.218724 Montenegro 5.5468
## 74 Kyrgyzstan 5.207786 Kyrgyzstan 5.8285
## 75 Romania 5.166875 Romania 6.4770
## 76 Serbia 5.154522 Serbia 6.1780
## 77 South Africa 5.133888 South Africa 5.1939
## 78 Jordan 5.131996 Jordan 4.1521
## 79 Pakistan 5.131565 Pakistan 4.5158
## 80 Latvia 5.125025 Latvia 6.1804
## 81 Greece 5.096354 Greece 5.9482
## 82 China 5.094917 China 5.5853
## 83 Somaliland region 5.057314 <NA> NA
## 84 Ghana 5.057262 Ghana 4.8720
## 85 Ukraine 5.030342 Ukraine 5.0841
## 86 Bahrain 5.027187 Bahrain 6.6469
## 87 Zambia 5.013375 Zambia 3.7598
## 88 Philippines 5.001965 Philippines 5.9036
## 89 Portugal 4.993962 Portugal 6.0163
## 90 Morocco 4.969657 Morocco 5.0599
## 91 Zimbabwe 4.955101 Zimbabwe 2.9951
## 92 Azerbaijan 4.910772 Azerbaijan 5.1734
## 93 Mongolia 4.885150 Mongolia 5.7607
## 94 Laos 4.876085 Laos 5.1403
## 95 Botswana 4.835939 Botswana 3.4711
## 96 Bosnia and Herzegovina 4.773145 Bosnia and Herzegovina 5.7680
## 97 Dominican Republic 4.753311 Dominican Republic 5.7372
## 98 Bangladesh 4.724444 Bangladesh 5.1555
## 99 India 4.720147 India 3.7771
## 100 Hungary 4.683358 Hungary 6.0859
## 101 Mauritania 4.673204 Mauritania 4.1526
## 102 Iraq 4.659509 Iraq 4.9409
## 103 Palestine 4.646608 Palestine 4.4825
## 104 North Macedonia 4.639647 North Macedonia 5.1990
## 105 Democratic Republic of Congo 4.639227 <NA> NA
## 106 Iran 4.608928 Iran 4.8879
## 107 Honduras 4.602218 Honduras 6.0221
## 108 Lebanon 4.572567 Lebanon 2.9553
## 109 Ethiopia 4.561169 Ethiopia 4.2415
## 110 Sudan 4.550499 <NA> NA
## 111 Kenya 4.547335 Kenya 4.5430
## 112 Tajikistan 4.496571 Tajikistan 5.3771
## 113 Tunisia 4.463531 Tunisia 4.5160
## 114 Myanmar 4.438940 Myanmar 4.3942
## 115 Haiti 4.413475 <NA> NA
## 116 Angola 4.360250 <NA> NA
## 117 Armenia 4.319712 Armenia 5.3986
## 118 Mali 4.313017 Mali 4.4790
## 119 Uganda 4.309238 Uganda 4.6026
## 120 Malawi 4.279270 Malawi 3.7504
## 121 Georgia 4.254446 Georgia 4.9732
## 122 Cameroon 4.244634 Cameroon 5.0476
## 123 Nepal 4.233245 Nepal 5.3765
## 124 Sri Lanka 4.224593 Sri Lanka 4.3622
## 125 Bulgaria 4.222297 Bulgaria 5.3709
## 126 Egypt 4.204157 Egypt 4.2877
## 127 Yemen 4.060601 Yemen 4.1969
## 128 Chad 4.032975 Chad 4.2508
## 129 Tanzania 4.006897 Tanzania 3.7024
## 130 Gabon 3.972059 Gabon 4.9583
## 131 Comoros 3.955640 Comoros 4.6086
## 132 Burkina Faso 3.955008 Burkina Faso 4.6705
## 133 Congo 3.919342 Congo 5.0752
## 134 Cambodia 3.898707 Cambodia 4.6403
## 135 Niger 3.798088 Niger 5.0035
## 136 Afghanistan 3.782938 Afghanistan 2.4038
## 137 Senegal 3.668737 Senegal 5.0461
## 138 Guinea 3.651555 Guinea 4.8905
## 139 Madagascar 3.550610 Madagascar 4.3391
## 140 Rwanda 3.333048 Rwanda 3.2682
## 141 Benin 3.193469 Benin 4.6232
## 142 Syria 3.164491 <NA> NA
# compute the 2010 -> 2020 score change once and reuse it for both columns
# (the original evaluated the identical expression (score2020-score2010)
#  twice; within one mutate() a later column may reference an earlier one)
h_2010_2020 <- h_2010_2020 %>%
  mutate(neg_difference = score2020 - score2010,
         pos_difference = neg_difference)
# negative difference = happiness score decreased from 2010 to 2020
h_2010_2020 %>% select(entity, neg_difference) %>% arrange(neg_difference)
## entity neg_difference
## 1 Venezuela -2.1410774
## 2 Zimbabwe -1.9600005
## 3 Lebanon -1.6172670
## 4 Afghanistan -1.3791375
## 5 Botswana -1.3648390
## 6 Zambia -1.2535750
## 7 Mexico -1.1925850
## 8 Jordan -0.9798960
## 9 India -0.9430467
## 10 Nigeria -0.9409543
## 11 Costa Rica -0.6902500
## 12 United Arab Emirates -0.6417670
## 13 Pakistan -0.6157650
## 14 Colombia -0.5942800
## 15 Turkey -0.5648763
## 16 Panama -0.5507356
## 17 Malawi -0.5288697
## 18 Mauritania -0.5206040
## 19 Argentina -0.5013870
## 20 Algeria -0.4822957
## 21 Ecuador -0.4274162
## 22 Chile -0.4273287
## 23 Libya -0.4241940
## 24 Bolivia -0.4185947
## 25 Thailand -0.4090353
## 26 Canada -0.3900444
## 27 Brazil -0.3672037
## 28 Ethiopia -0.3196687
## 29 Norway -0.3131770
## 30 Albania -0.3113240
## 31 Tanzania -0.3044974
## 32 Peru -0.2654573
## 33 Switzerland -0.2646090
## 34 Paraguay -0.2417583
## 35 Austria -0.2376886
## 36 Malaysia -0.2029838
## 37 Ghana -0.1852620
## 38 Sweden -0.1758480
## 39 Palestine -0.1641084
## 40 Russia -0.1621356
## 41 Moldova -0.1389128
## 42 Belgium -0.1301220
## 43 Indonesia -0.1281740
## 44 Kuwait -0.1149946
## 45 South Korea -0.0681870
## 46 Rwanda -0.0648479
## 47 Hong Kong -0.0589646
## 48 Netherlands -0.0558155
## 49 New Zealand -0.0498300
## 50 Vietnam -0.0495697
## 51 United States -0.0494270
## 52 Myanmar -0.0447396
## 53 Australia -0.0334857
## 54 Iceland -0.0331600
## 55 Kenya -0.0043350
## 56 Northern Cyprus 0.0032945
## 57 Turkmenistan 0.0104730
## 58 Uruguay 0.0238715
## 59 France 0.0373346
## 60 Cyprus 0.0403930
## 61 Uzbekistan 0.0433680
## 62 Tunisia 0.0524690
## 63 Ukraine 0.0537580
## 64 South Africa 0.0600120
## 65 United Kingdom 0.0617160
## 66 Japan 0.0706836
## 67 Belarus 0.0724565
## 68 Ireland 0.0761546
## 69 Egypt 0.0835430
## 70 Morocco 0.0902435
## 71 Croatia 0.0975654
## 72 Denmark 0.1162906
## 73 Saudi Arabia 0.1264406
## 74 Yemen 0.1362992
## 75 Sri Lanka 0.1376070
## 76 Mali 0.1659830
## 77 Spain 0.1856096
## 78 El Salvador 0.1856285
## 79 Chad 0.2178253
## 80 Poland 0.2468683
## 81 Israel 0.2529454
## 82 Azerbaijan 0.2626280
## 83 Laos 0.2642150
## 84 Iran 0.2789723
## 85 Iraq 0.2813913
## 86 Uganda 0.2933620
## 87 Montenegro 0.3280757
## 88 Germany 0.3317380
## 89 Taiwan 0.3865830
## 90 Finland 0.4007906
## 91 Guatemala 0.4064830
## 92 Bangladesh 0.4310560
## 93 Luxembourg 0.4399030
## 94 Kazakhstan 0.4746305
## 95 Slovakia 0.4796406
## 96 Malta 0.4840280
## 97 China 0.4903827
## 98 North Macedonia 0.5593525
## 99 Slovenia 0.5672090
## 100 Czechia 0.5856506
## 101 Kyrgyzstan 0.6207144
## 102 Italy 0.6273860
## 103 Comoros 0.6529597
## 104 Lithuania 0.6745630
## 105 Burkina Faso 0.7154920
## 106 Nicaragua 0.7165940
## 107 Georgia 0.7187544
## 108 Cambodia 0.7415930
## 109 Madagascar 0.7884904
## 110 Cameroon 0.8029660
## 111 Kosovo 0.8155116
## 112 Greece 0.8518460
## 113 Mongolia 0.8755496
## 114 Tajikistan 0.8805285
## 115 Philippines 0.9016350
## 116 Estonia 0.9768720
## 117 Dominican Republic 0.9838890
## 118 Gabon 0.9862408
## 119 Bosnia and Herzegovina 0.9948553
## 120 Portugal 1.0223377
## 121 Serbia 1.0234780
## 122 Latvia 1.0553747
## 123 Armenia 1.0788883
## 124 Nepal 1.1432550
## 125 Bulgaria 1.1486030
## 126 Congo 1.1558582
## 127 Niger 1.2054117
## 128 Guinea 1.2389452
## 129 Romania 1.3101250
## 130 Senegal 1.3773630
## 131 Hungary 1.4025420
## 132 Honduras 1.4198820
## 133 Benin 1.4297312
## 134 Bahrain 1.6197130
## 135 Angola NA
## 136 Democratic Republic of Congo NA
## 137 Haiti NA
## 138 Qatar NA
## 139 Somaliland region NA
## 140 Sudan NA
## 141 Suriname NA
## 142 Syria NA
# positive difference = happiness score increased from 2010 to 2020
# desc() is the idiomatic dplyr way to sort descending (same order as -x)
h_2010_2020 %>% select(entity, pos_difference) %>% arrange(desc(pos_difference))
## entity pos_difference
## 1 Bahrain 1.6197130
## 2 Benin 1.4297312
## 3 Honduras 1.4198820
## 4 Hungary 1.4025420
## 5 Senegal 1.3773630
## 6 Romania 1.3101250
## 7 Guinea 1.2389452
## 8 Niger 1.2054117
## 9 Congo 1.1558582
## 10 Bulgaria 1.1486030
## 11 Nepal 1.1432550
## 12 Armenia 1.0788883
## 13 Latvia 1.0553747
## 14 Serbia 1.0234780
## 15 Portugal 1.0223377
## 16 Bosnia and Herzegovina 0.9948553
## 17 Gabon 0.9862408
## 18 Dominican Republic 0.9838890
## 19 Estonia 0.9768720
## 20 Philippines 0.9016350
## 21 Tajikistan 0.8805285
## 22 Mongolia 0.8755496
## 23 Greece 0.8518460
## 24 Kosovo 0.8155116
## 25 Cameroon 0.8029660
## 26 Madagascar 0.7884904
## 27 Cambodia 0.7415930
## 28 Georgia 0.7187544
## 29 Nicaragua 0.7165940
## 30 Burkina Faso 0.7154920
## 31 Lithuania 0.6745630
## 32 Comoros 0.6529597
## 33 Italy 0.6273860
## 34 Kyrgyzstan 0.6207144
## 35 Czechia 0.5856506
## 36 Slovenia 0.5672090
## 37 North Macedonia 0.5593525
## 38 China 0.4903827
## 39 Malta 0.4840280
## 40 Slovakia 0.4796406
## 41 Kazakhstan 0.4746305
## 42 Luxembourg 0.4399030
## 43 Bangladesh 0.4310560
## 44 Guatemala 0.4064830
## 45 Finland 0.4007906
## 46 Taiwan 0.3865830
## 47 Germany 0.3317380
## 48 Montenegro 0.3280757
## 49 Uganda 0.2933620
## 50 Iraq 0.2813913
## 51 Iran 0.2789723
## 52 Laos 0.2642150
## 53 Azerbaijan 0.2626280
## 54 Israel 0.2529454
## 55 Poland 0.2468683
## 56 Chad 0.2178253
## 57 El Salvador 0.1856285
## 58 Spain 0.1856096
## 59 Mali 0.1659830
## 60 Sri Lanka 0.1376070
## 61 Yemen 0.1362992
## 62 Saudi Arabia 0.1264406
## 63 Denmark 0.1162906
## 64 Croatia 0.0975654
## 65 Morocco 0.0902435
## 66 Egypt 0.0835430
## 67 Ireland 0.0761546
## 68 Belarus 0.0724565
## 69 Japan 0.0706836
## 70 United Kingdom 0.0617160
## 71 South Africa 0.0600120
## 72 Ukraine 0.0537580
## 73 Tunisia 0.0524690
## 74 Uzbekistan 0.0433680
## 75 Cyprus 0.0403930
## 76 France 0.0373346
## 77 Uruguay 0.0238715
## 78 Turkmenistan 0.0104730
## 79 Northern Cyprus 0.0032945
## 80 Kenya -0.0043350
## 81 Iceland -0.0331600
## 82 Australia -0.0334857
## 83 Myanmar -0.0447396
## 84 United States -0.0494270
## 85 Vietnam -0.0495697
## 86 New Zealand -0.0498300
## 87 Netherlands -0.0558155
## 88 Hong Kong -0.0589646
## 89 Rwanda -0.0648479
## 90 South Korea -0.0681870
## 91 Kuwait -0.1149946
## 92 Indonesia -0.1281740
## 93 Belgium -0.1301220
## 94 Moldova -0.1389128
## 95 Russia -0.1621356
## 96 Palestine -0.1641084
## 97 Sweden -0.1758480
## 98 Ghana -0.1852620
## 99 Malaysia -0.2029838
## 100 Austria -0.2376886
## 101 Paraguay -0.2417583
## 102 Switzerland -0.2646090
## 103 Peru -0.2654573
## 104 Tanzania -0.3044974
## 105 Albania -0.3113240
## 106 Norway -0.3131770
## 107 Ethiopia -0.3196687
## 108 Brazil -0.3672037
## 109 Canada -0.3900444
## 110 Thailand -0.4090353
## 111 Bolivia -0.4185947
## 112 Libya -0.4241940
## 113 Chile -0.4273287
## 114 Ecuador -0.4274162
## 115 Algeria -0.4822957
## 116 Argentina -0.5013870
## 117 Mauritania -0.5206040
## 118 Malawi -0.5288697
## 119 Panama -0.5507356
## 120 Turkey -0.5648763
## 121 Colombia -0.5942800
## 122 Pakistan -0.6157650
## 123 United Arab Emirates -0.6417670
## 124 Costa Rica -0.6902500
## 125 Nigeria -0.9409543
## 126 India -0.9430467
## 127 Jordan -0.9798960
## 128 Mexico -1.1925850
## 129 Zambia -1.2535750
## 130 Botswana -1.3648390
## 131 Afghanistan -1.3791375
## 132 Lebanon -1.6172670
## 133 Zimbabwe -1.9600005
## 134 Venezuela -2.1410774
## 135 Angola NA
## 136 Democratic Republic of Congo NA
## 137 Haiti NA
## 138 Qatar NA
## 139 Somaliland region NA
## 140 Sudan NA
## 141 Suriname NA
## 142 Syria NA
# NOTE(review): the original passed color=factor(price) to ggplot() OUTSIDE
# of aes(); extra arguments to ggplot() are silently ignored, so the plots
# were not colored at all.  Aesthetic mappings must live inside aes().
# Beware: factor(price) has thousands of levels, so the legend is huge —
# consider mapping price continuously (aes(..., color = price)) instead.
ggplot(diamonds, aes(cut, carat, color = factor(price))) +
  geom_point()
ggplot(diamonds, aes(cut, carat, color = factor(price))) +
  geom_line() +
  facet_wrap(~depth)
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
# load the Eurostat death-causes table (first column appears to be TIME)
death <- read.csv("https://raw.githubusercontent.com/MarkusStefan/Data_Analytics/main/Exercise1/Death_Causes_Eurostat.csv")
# pie chart of the first column
pie(x=death[,1], main="EU27 deaths")
# NOTE(review): rainbow(n) expects an integer count, not a factor — the
# original rainbow(factor(death$TIME)) silently used only the first factor
# code as n, so the legend colors did not match the number of entries.
# Use one color per legend entry instead.
legend("topright", inset = 0.05, title = "Legend", legend = death$TIME,
       fill = rainbow(length(death$TIME)))
# reshape the wide death-causes table into long format: one block of
# (TIME, value, cause-name) rows per data column.
# Building the pieces with lapply() and binding once via do.call(rbind, ...)
# avoids growing a data frame with rbind() inside a loop (O(n^2) copies).
# The data.frame() expressions are kept identical to the original so the
# auto-generated (deparsed) column names — and thus the clean_names() result
# used by the plots below — are unchanged.
# NOTE(review): the hard-coded 10 assumes death has 10 rows — TODO confirm.
pool <- do.call(rbind, lapply(2:ncol(death), function(c) {
  data.frame(death$TIME, death[, c], rep(colnames(death)[c], 10))
}))
# standardize the auto-generated column names (janitor)
pool <- pool %>% clean_names()
# NOTE(review): the original used pool$col inside aes(), which ggplot2 itself
# warns is discouraged — aesthetics are evaluated in the context of `data`,
# so bare column names are correct and keep faceting/grouping semantics.
ggplot(pool, aes(x = death_time)) +
  geom_point(aes(y = death_c)) +
  facet_wrap(~death_c_rep_colnames_death_c_10 <- NULL) # placeholder removed